#imports
import calendar
import folium
import folium.plugins
import matplotlib.pyplot as plt
import missingno as msno
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import scikitplot as skplot
import seaborn as sns
from bs4 import BeautifulSoup
from matplotlib import cm
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.manifold import TSNE
from umap import UMAP
from urllib.request import urlopen
from wordcloud import WordCloud
# Show every DataFrame column when displaying (no horizontal truncation)
pd.options.display.max_columns = None
# Global seaborn theme for all plots below
sns.set_style("darkgrid")
# Dataset from: https://www.kaggle.com/dansbecker/melbourne-housing-snapshot
# Load the Melbourne housing snapshot and preview the first rows
# (head() defaults to five rows).
df = pd.read_csv('melb_data.csv')
df.head()
| Suburb | Address | Rooms | Type | Price | Method | SellerG | Date | Distance | Postcode | Bedroom2 | Bathroom | Car | Landsize | BuildingArea | YearBuilt | CouncilArea | Lattitude | Longtitude | Regionname | Propertycount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Abbotsford | 85 Turner St | 2 | h | 1480000.0 | S | Biggin | 3/12/2016 | 2.5 | 3067.0 | 2.0 | 1.0 | 1.0 | 202.0 | NaN | NaN | Yarra | -37.7996 | 144.9984 | Northern Metropolitan | 4019.0 |
| 1 | Abbotsford | 25 Bloomburg St | 2 | h | 1035000.0 | S | Biggin | 4/02/2016 | 2.5 | 3067.0 | 2.0 | 1.0 | 0.0 | 156.0 | 79.0 | 1900.0 | Yarra | -37.8079 | 144.9934 | Northern Metropolitan | 4019.0 |
| 2 | Abbotsford | 5 Charles St | 3 | h | 1465000.0 | SP | Biggin | 4/03/2017 | 2.5 | 3067.0 | 3.0 | 2.0 | 0.0 | 134.0 | 150.0 | 1900.0 | Yarra | -37.8093 | 144.9944 | Northern Metropolitan | 4019.0 |
| 3 | Abbotsford | 40 Federation La | 3 | h | 850000.0 | PI | Biggin | 4/03/2017 | 2.5 | 3067.0 | 3.0 | 2.0 | 1.0 | 94.0 | NaN | NaN | Yarra | -37.7969 | 144.9969 | Northern Metropolitan | 4019.0 |
| 4 | Abbotsford | 55a Park St | 4 | h | 1600000.0 | VB | Nelson | 4/06/2016 | 2.5 | 3067.0 | 3.0 | 1.0 | 2.0 | 120.0 | 142.0 | 2014.0 | Yarra | -37.8072 | 144.9941 | Northern Metropolitan | 4019.0 |
# Data types of each feature; note 'Date' is still a plain object/string
# here -- it is converted to datetime further down
df.dtypes
Suburb object Address object Rooms int64 Type object Price float64 Method object SellerG object Date object Distance float64 Postcode float64 Bedroom2 float64 Bathroom float64 Car float64 Landsize float64 BuildingArea float64 YearBuilt float64 CouncilArea object Lattitude float64 Longtitude float64 Regionname object Propertycount float64 dtype: object
# Null values per column: Car (62), BuildingArea (6450), YearBuilt (5375)
# and CouncilArea (1369) contain missing data; everything else is complete
df.isnull().sum()
Suburb 0 Address 0 Rooms 0 Type 0 Price 0 Method 0 SellerG 0 Date 0 Distance 0 Postcode 0 Bedroom2 0 Bathroom 0 Car 62 Landsize 0 BuildingArea 6450 YearBuilt 5375 CouncilArea 1369 Lattitude 0 Longtitude 0 Regionname 0 Propertycount 0 dtype: int64
# The dataset covers housing sales between 2016-2017; dates are written
# 'D/M/YYYY' (Australian day-first, e.g. 3/12/2016 = 3 Dec 2016), so parse
# with dayfirst=True -- the pandas default is month-first and silently
# swaps day and month whenever both are <= 12.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
# Year and month (abbreviated name) the property was sold
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Month'] = df['Month'].apply(lambda x: calendar.month_abbr[x])
# Real-estate sale count per year and month
df.groupby(['Year', 'Month'])['Year'].count()
# Drop columns not used in the analysis
df.drop(columns=['YearBuilt', 'Postcode', 'Address'], inplace=True)
# Unknown council areas become an explicit 'Unavailable' category
df['CouncilArea'] = df['CouncilArea'].fillna('Unavailable')
# Only 62 rows lack 'Car' -- drop them rather than impute
df.dropna(subset=['Car'], inplace=True)
# Listing count per (BuildingArea, Suburb) -- note the 0.0 areas and the
# huge 44515 m^2 outlier visible in the output below
df.groupby(['BuildingArea', 'Suburb'])['Suburb'].count()
BuildingArea Suburb
0.0 Balwyn North 2
Brighton East 1
Bundoora 2
Craigieburn 1
Epping 1
..
1561.0 Elsternwick 1
3112.0 Camberwell 1
3558.0 Fitzroy North 1
6791.0 Bulleen 1
44515.0 New Gisborne 1
Name: Suburb, Length: 5984, dtype: int64
df.describe()
| Rooms | Price | Distance | Bedroom2 | Bathroom | Car | Landsize | BuildingArea | Lattitude | Longtitude | Propertycount | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 13518.000000 | 1.351800e+04 | 13518.000000 | 13518.000000 | 13518.00000 | 13518.000000 | 13518.000000 | 7101.000000 | 13518.000000 | 13518.000000 | 13518.000000 | 13518.000000 |
| mean | 2.939784 | 1.074796e+06 | 10.157827 | 2.916408 | 1.53536 | 1.610075 | 558.110593 | 152.128903 | -37.809191 | 144.995306 | 7455.482986 | 2016.531292 |
| std | 0.956438 | 6.398586e+05 | 5.861593 | 0.966692 | 0.69231 | 0.962634 | 3998.194560 | 542.109910 | 0.079366 | 0.104003 | 4381.437721 | 0.499038 |
| min | 1.000000 | 8.500000e+04 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | -38.182550 | 144.431810 | 249.000000 | 2016.000000 |
| 25% | 2.000000 | 6.500000e+05 | 6.200000 | 2.000000 | 1.00000 | 1.000000 | 178.000000 | 93.000000 | -37.857000 | 144.929425 | 4380.000000 | 2016.000000 |
| 50% | 3.000000 | 9.010000e+05 | 9.200000 | 3.000000 | 1.00000 | 2.000000 | 442.500000 | 126.000000 | -37.802300 | 145.000300 | 6567.000000 | 2017.000000 |
| 75% | 3.000000 | 1.328000e+06 | 13.000000 | 3.000000 | 2.00000 | 2.000000 | 651.000000 | 174.000000 | -37.756203 | 145.058600 | 10331.000000 | 2017.000000 |
| max | 10.000000 | 9.000000e+06 | 48.100000 | 20.000000 | 8.00000 | 10.000000 | 433014.000000 | 44515.000000 | -37.408530 | 145.526350 | 21650.000000 | 2017.000000 |
# Price ranges from $85,000 to $9,000,000 -- trim the extreme tails.
# Keep rows strictly between the 1st and 99.5th percentiles of Price and
# Propertycount (asymmetric cut: the upper tail is the more extreme one;
# the old comment incorrectly said "top and bottom 1 percentiles").
for col in ('Price', 'Propertycount'):
    print(col)
    lo, hi = np.percentile(df[col], [1, 99.5])
    df = df[(df[col] > lo) & (df[col] < hi)]
    print(df[col].describe())
# How many rows report zero bathrooms / bedrooms?
df[df['Bathroom'] == 0].count()
df[df['Bedroom2'] == 0].count()
# Listings with zero bedrooms or bathrooms are data errors -- drop them
df = df[(df['Bedroom2'] != 0) & (df['Bathroom'] != 0)]
# Drop the single extreme Landsize outlier (row label 11020 in the raw CSV)
df.drop(index=11020, inplace=True)
# Rebuild a clean 0..n-1 RangeIndex (drop=True avoids the old
# reset_index + `del df['index']` two-step)
df.reset_index(drop=True, inplace=True)
Price count 1.329700e+04 mean 1.064641e+06 std 5.763645e+05 min 3.010000e+05 25% 6.510000e+05 50% 9.050000e+05 75% 1.325000e+06 max 3.860000e+06 Name: Price, dtype: float64 Propertycount count 12808.000000 mean 7138.232042 std 3715.017497 min 852.000000 25% 4380.000000 50% 6543.000000 75% 9264.000000 max 17496.000000 Name: Propertycount, dtype: float64
# Visualise remaining missingness; after the cleaning above only
# BuildingArea should still show gaps
msno.matrix(df)
<AxesSubplot:>
# One-off downloads of the NLTK corpora used by the word clouds below
# (no-ops if the packages are already present)
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to [nltk_data] /Users/jeffrey/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to /Users/jeffrey/nltk_data... [nltk_data] Package punkt is already up-to-date!
True
def _wordcloud_from_url(url):
    """Fetch a web page, keep its visible text, and render a word cloud.

    The two original cells were verbatim copies of each other; this helper
    removes the duplication.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    """
    html = urlopen(url).read()
    soup = BeautifulSoup(html)
    # Flatten the page text into individual non-empty phrases
    raw_lines = (line.strip() for line in soup.get_text().splitlines())
    chunks = (phrase.strip() for line in raw_lines for phrase in line.split(" "))
    # Fixed: this was 'n'.join(...) -- a typo that glued chunks together
    # with the letter 'n'; a newline is the intended separator
    text = '\n'.join(chunk for chunk in chunks if chunk)
    # Keep alphabetic tokens only, lower-cased, minus English stop words.
    # (The original rebuilt the stop-word LIST for every token; test
    # membership against the set instead -- O(1) per word.)
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
    filtered_words = [word for word in tokens if word not in stop_words]
    wc = WordCloud(max_words=1000, margin=10, background_color='white',
                   scale=3, relative_scaling=0.5, width=500, height=400,
                   random_state=1).generate(' '.join(filtered_words))
    plt.figure(figsize=(10, 5))
    plt.imshow(wc)
    plt.axis("off")
    plt.show()


# Word clouds for two chapters of the Senate housing-assistance report
_wordcloud_from_url("https://www.aph.gov.au/Parliamentary_Business/Committees/Senate/Former_Committees/hsaf/report/c04")
_wordcloud_from_url("https://www.aph.gov.au/Parliamentary_Business/Committees/Senate/Former_Committees/hsaf/report/c05")
# Correlation heatmap of the numeric features
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), cmap='RdBu_r')
plt.show()
# Histogram of every numeric feature (20 bins each)
df.hist(figsize = (14,14), bins = 20, color="cornflowerblue")
plt.show()
# Boxplots: price density and distribution per property type
# (the old comment said "violinplots", but sns.boxplot is what is used)
# Human-readable labels for the 'h'/'u'/'t' type codes
types = ['House', 'Unit', 'Townhouse']
plt.figure(figsize=(10,6))
ax = sns.boxplot(y="Price", x='Type', data=df)
# NOTE(review): assumes the x categories appear in the order h, u, t --
# confirm against the data before trusting the relabelling
ax.set_xticklabels(types)
plt.title("Price Distribution by Property Type", fontsize=20)
# Fixed: the x-axis is the property type, not the neighbourhood group
# (the label was copy-pasted from another plot)
plt.xlabel('Type of Property', fontsize=15)
plt.ylabel("Price", fontsize=15);
# Number of listings of each property type
ax = sns.countplot(x=df['Type'], palette="plasma")
# NOTE(review): assumes the x categories appear in the order h, u, t
ax.set_xticklabels(types)
fig = plt.gcf()
fig.set_size_inches(10,5)
plt.title('Total Number of Listings per Property Type',fontsize = 20)
plt.xlabel('Type of Property',fontsize = 15)
plt.ylabel('Count of Property',fontsize = 15)
plt.tick_params(labelsize=10)
# Listing counts per region, split by property type
plt.figure(figsize=(10,5))
sns.countplot(data=df, x='Regionname', hue='Type', palette='GnBu_d')
plt.title('Number of Listings in Regions by Property Type', fontsize=20)
plt.xlabel('Neighbourhood group',fontsize=15)
plt.ylabel("Count",fontsize=15)
# NOTE(review): legend labels rely on the hue order matching h, u, t
plt.legend(frameon=False, fontsize=10,labels=types, loc="upper right")
plt.xticks(rotation=35, ha="right")
plt.tick_params(labelsize=10)
# Split the sales by year, keeping only the columns used in the
# time-series plots below.
_cols = ['Month', 'Price', 'Rooms', 'Propertycount', 'Type']
df_2016 = df.loc[df['Year'] == 2016, _cols]
df_2017 = df.loc[df['Year'] == 2017, _cols]
# Monthly price trend for 2016 sales
plt.figure(figsize=(10,5))
sns.lineplot(data=df_2016, x="Month", y="Price")
plt.title('2016 Prices', fontsize=20)
Text(0.5, 1.0, '2016 Prices')
# Monthly price trend for 2017 sales
plt.figure(figsize=(10,5))
sns.lineplot(data=df_2017, x="Month", y="Price")
plt.title('2017 Prices', fontsize=20)
Text(0.5, 1.0, '2017 Prices')
# Listings per month in 2016, split by property type
plt.figure(figsize=(10,5))
sns.countplot(data=df_2016, x='Month', hue='Type', palette='flare')
plt.title('Number of Apartment Types in 2016', fontsize=20)
plt.xlabel('Month',fontsize=15)
plt.ylabel("Number of Listings",fontsize=15)
# NOTE(review): legend labels rely on the hue order matching h, u, t
plt.legend(frameon=False, fontsize=10,labels=types, loc="upper right")
plt.xticks(rotation=35, ha="right")
plt.tick_params(labelsize=10)
# Same breakdown for 2017
plt.figure(figsize=(10,5))
sns.countplot(data=df_2017, x='Month', hue='Type')
plt.title('Number of Apartment Types in 2017', fontsize=20)
plt.xlabel('Month',fontsize=15)
plt.ylabel("Number of Listings",fontsize=15)
plt.legend(frameon=False, fontsize=10,labels=types, loc="upper right")
plt.xticks(rotation=35, ha="right")
plt.tick_params(labelsize=10)
# Scatter the listings geographically (x = longitude, y = latitude),
# coloured by property type. Column names keep the dataset's original
# misspellings 'Lattitude'/'Longtitude'.
data = df.copy()
data['Type'] = data['Type'].map({'h': 'House', 'u': 'Unit', 't': 'Townhouse'})
plt.figure(figsize=(10, 10))
sns.scatterplot(data=data, x='Longtitude', y='Lattitude', hue='Type', alpha=0.7, legend='full')
plt.title('Geographic Map of Listings by Property Type', fontsize=20)
# Fixed: the axis labels were swapped (x plots longitude, y latitude)
plt.xlabel('Longtitude', fontsize=12)
plt.ylabel('Lattitude', fontsize=12)
plt.tick_params(labelsize=10)
# Folium heat-map of listing density over Melbourne.
# Fixed: the centre latitude was +37.8136 (northern hemisphere);
# Melbourne is at -37.8136 (cf. the marker map below).
my_heatmap = folium.Map(location=[-37.8136, 144.9631], zoom_start=8)
# [lat, lon] pairs for every listing (values.tolist() avoids the slow
# per-row iterrows loop)
heat_data = df[['Lattitude', 'Longtitude']].values.tolist()
# Plot the heat layer on the map
folium.plugins.HeatMap(heat_data, min_opacity=0.6, blur=30).add_to(my_heatmap)
# Zoom/centre the map to the data's bounding box, then display it
sw = df[['Lattitude', 'Longtitude']].min().values.tolist()
ne = df[['Lattitude', 'Longtitude']].max().values.tolist()
my_heatmap.fit_bounds([sw, ne])
my_heatmap
# Marker-cluster map: one marker per listing, popup shows the region name.
locations = df[['Lattitude', 'Longtitude']]
locationlist = locations.values.tolist()
len(locationlist)
locationlist
map2 = folium.Map(location=[-37.8136, 144.9631], zoom_start=10, tiles='Stamen Terrain')
marker_cluster = folium.plugins.MarkerCluster().add_to(map2)
# zip pairs coordinates with regions positionally; the old
# range(len(...)) loop indexed df['Regionname'][point] by LABEL, which
# breaks whenever the index is not exactly 0..n-1
for coords, region in zip(locationlist, df['Regionname']):
    folium.Marker(coords, popup=region).add_to(marker_cluster)
map2